DIAMOND DATASET EDA(Exploratory Data Analyis) Using Plotly¶

In [46]:
import pandas as pd
import plotly.express as px
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
In [47]:
# Load the diamond dataset

df = pd.read_csv("diamonds.csv")
In [48]:
df.head()
Out[48]:
Unnamed: 0 carat cut color clarity depth table price x y z
0 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [49]:
df.columns
Out[49]:
Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')

Data Cleaning Method, dropping the column 'Unnamed: 0' fron the dataset.¶

In [50]:
 df.drop(['Unnamed: 0'], axis=1, inplace=True)
In [51]:
df.head()
Out[51]:
carat cut color clarity depth table price x y z
0 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43
1 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31
2 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31
3 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63
4 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75
In [52]:
df.isnull().sum()
Out[52]:
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64
In [53]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB
In [54]:
df.describe().T
Out[54]:
count mean std min 25% 50% 75% max
carat 53940.0 0.797940 0.474011 0.2 0.40 0.70 1.04 5.01
depth 53940.0 61.749405 1.432621 43.0 61.00 61.80 62.50 79.00
table 53940.0 57.457184 2.234491 43.0 56.00 57.00 59.00 95.00
price 53940.0 3932.799722 3989.439738 326.0 950.00 2401.00 5324.25 18823.00
x 53940.0 5.731157 1.121761 0.0 4.71 5.70 6.54 10.74
y 53940.0 5.734526 1.142135 0.0 4.72 5.71 6.54 58.90
z 53940.0 3.538734 0.705699 0.0 2.91 3.53 4.04 31.80

Data Visualization¶

Animated Visualization of cut vs clarity over depth¶

In [55]:
import pandas as pd
import plotly.express as px

# Step 1: Import necessary libraries

# Step 2: Load diamond dataset into a DataFrame
df = pd.read_csv("diamonds.csv")

# Step 3: Create an animated line plot of cut vs clarity over depth
fig = px.line(df,
                 x="cut",
                 y="clarity",
                 animation_frame="depth",
                 range_x=[0, 5],
                 range_y=[0, 20000])

# Step 4: Add customizations if needed

# Show the interactive animation plot
fig.show()
In [56]:
import pandas as pd
import plotly.express as px

# Step 1: Import necessary libraries

# Step 2: Load diamond dataset into a DataFrame
df = pd.read_csv("diamonds.csv")

# Step 3: Create an animated scatter plot of carat vs price over time
fig = px.scatter(df,
                 x="carat",
                 y="cut",
                 animation_frame="table",
                 range_x=[0, 5],
                 range_y=[0, 20000])

# Step 4: Add customizations if needed

# Show the interactive animation plot
fig.show()
In [57]:
import plotly.express as px

# Create heatmap using Plotly Express
fig = px.imshow(
  df.corr(),
  color_continuous_scale="Inferno_r",
)

# Show plot
fig.show()
C:\Users\alexa\AppData\Local\Temp\ipykernel_19288\2978107106.py:5: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Carat vs Price¶

-- Ordinary Least Squares (OLS) trendline function. Requires statsmodels to be installed. This trendline function causes fit results to be stored within the figure, accessible via the plotly. express. get_trendline_results function.

In [58]:
fig=px.scatter(df, x='carat', y='price',trendline="ols",title='Carat Vs Price',
              color_discrete_sequence=['red'],template='plotly_dark')
fig.show()

Cut Vs Price¶

In [59]:
cut_price = df.groupby("cut").price.mean().reset_index()
px.bar(cut_price, x="cut", y='price',color='cut',title='Cut Vs Prices')

Color Vs Prices¶

In [60]:
color_price = df.groupby("color").price.mean().reset_index()
px.bar(color_price, x='color', y='price',color='color',template='ggplot2',title='Color Vs Prices')

Clarity vs. Price¶

In [61]:
clarity_price = df.groupby("clarity").price.mean().reset_index()
clarity_price.head()
Out[61]:
clarity price
0 I1 3924.168691
1 IF 2864.839106
2 SI1 3996.001148
3 SI2 5063.028606
4 VS1 3839.455391
In [62]:
px.bar(clarity_price, x='clarity', y='price',color_discrete_sequence=['green'],title='Clarity vs. Price')

Depth vs Price¶

In [63]:
px.scatter(df, x='depth', y='price',color_discrete_sequence=['orange'],trendline="ols",template='plotly_dark',title='Depth Vs Price')

IMPORTANT FEATURES VISUALIZATION¶

In [64]:
fig = px.scatter(df, x="carat", y="price", trendline="ols",color="color",
                title="Carat Vs Color Vs Price")
fig.show()

Create volume of diamonds that contains x y z¶

In [65]:
df["volume"] = df["x"] * df["y"] * df["z"]
df = df.drop(["x", "y", "z"], axis= 1)
df = df.drop(df.index[df["volume"]== 0], axis= 0)
In [66]:
df.head()
Out[66]:
Unnamed: 0 carat cut color clarity depth table price volume
0 1 0.23 Ideal E SI2 61.5 55.0 326 38.202030
1 2 0.21 Premium E SI1 59.8 61.0 326 34.505856
2 3 0.23 Good E VS1 56.9 65.0 327 38.076885
3 4 0.29 Premium I VS2 62.4 58.0 334 46.724580
4 5 0.31 Good J SI2 63.3 58.0 335 51.917250
In [67]:
figure = px.scatter(data_frame = df, x="carat",
                    y="price", size="depth", 
                    color= "cut", trendline="ols")
figure.show()

Prices of all the types of diamonds based on their colour:¶

In [68]:
fig = px.box(df, x="cut", 
             y="price", 
             color="color")
fig.show()

Prices of all the types of diamonds based on their clarity:¶

In [69]:
fig = px.box(df, 
             x="cut", 
             y="price", 
             color="clarity")
fig.show()
In [70]:
correlation = df.corr()
print(correlation["price"].sort_values(ascending=False))
price         1.000000
carat         0.921592
volume        0.904255
table         0.127245
depth        -0.010729
Unnamed: 0   -0.307092
Name: price, dtype: float64
C:\Users\alexa\AppData\Local\Temp\ipykernel_19288\3962303493.py:1: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

Diamond Price Prediction¶

In [71]:
import pandas as pd
import plotly.express as px

# Step 1: Import necessary libraries

# Step 2: Load diamond dataset into a DataFrame
df = pd.read_csv("diamonds.csv")

# Step 3: Create an animated scatter plot of carat vs price over time
fig = px.scatter(df,
                 x="carat",
                 y="price",
                 animation_frame="cut",
                 range_x=[0, 5],
                 range_y=[0, 20000])

# Step 4: Add customizations if needed

# Show the interactive animation plot
fig.show()

Carat Vs Price Relationship¶

In [72]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.stats import ttest_ind
In [73]:
fig = px.scatter(df, x='carat', y='price', animation_frame='cut',
                 range_x=[0, df['carat'].max()+1],
                 range_y=[0, df['price'].max()+1000],
                 labels={'carat': 'Carat', 'price': 'Price'},
                 title="Carat vs Price Relationship by Cut")

# Set layout properties for aesthetics 
fig.update_layout(
    showlegend=False,
    width=800,
    height=500,
    xaxis=dict(title=dict(text='Carat')),
    yaxis=dict(title=dict(text='Price'))
)

# Show the animated scatter plot 
fig.show()
In [74]:
fig = px.scatter(df,
                 x="carat",
                 y="price",
                 animation_frame="clarity",
                 range_x=[0, 5],
                 range_y=[0, 20000])
fig.show()

Insights/Observations:¶

From the animated plots above (Matplotlib & Plotly), we can observe the changing relationship between carat and price over time.¶

As carat increases, there is generally a positive correlation with price.¶

In [75]:
# Statistical Tests:
significance_level = 0.05

T-test for comparing average prices based on cut categories¶

In [76]:
cut_categories = df["cut"].unique()
for i in range(len(cut_categories)-1):
    for j in range(i+1,len(cut_categories)):
        category_1_price = df[df["cut"]==cut_categories[i]]["price"]
        category_2_price = df[df["cut"]==cut_categories[j]]["price"]
        t_statistic, p_value = ttest_ind(category_1_price, category_2_price)
        
        print(f"T-test between {cut_categories[i]} and {cut_categories[j]}:")
        print(f"    T-Statistic: {t_statistic}")
        print(f"    P-value: {p_value}")
        if p_value < significance_level:
            print("    Null hypothesis rejected. There is a significant difference in average prices.")
        else:
            print("    Null hypothesis accepted. There is no significant difference in average prices.")
T-test between Ideal and Premium:
    T-Statistic: -25.650910588183198
    P-value: 8.585274269295433e-144
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Ideal and Good:
    T-Statistic: -7.871461326266651
    P-value: 3.638743214736485e-15
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Ideal and Very Good:
    T-Statistic: -11.965879837440761
    P-value: 6.255962296590797e-33
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Ideal and Fair:
    T-Statistic: -9.19948373061459
    P-value: 3.892181603768655e-20
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Premium and Good:
    T-Statistic: 9.4221127821383
    P-value: 4.922340877355623e-21
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Premium and Very Good:
    T-Statistic: 11.619045834785702
    P-value: 3.9409266225202815e-31
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Premium and Fair:
    T-Statistic: 2.0034976676632996
    P-value: 0.04514138588179719
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Good and Very Good:
    T-Statistic: -0.8085850944660599
    P-value: 0.41876516590101354
    Null hypothesis accepted. There is no significant difference in average prices.
T-test between Good and Fair:
    T-Statistic: -4.098387593811541
    P-value: 4.210907188664372e-05
    Null hypothesis rejected. There is a significant difference in average prices.
T-test between Very Good and Fair:
    T-Statistic: -3.6495164503086652
    P-value: 0.00026371160550971715
    Null hypothesis rejected. There is a significant difference in average prices.

Insights/Observations Made during Analysis:¶

There is no null values in the dataset¶

The distribution of price,carat is right skewed and there are outliers¶

The count of ideal cut, G color, SI1 clarity are more in this datset¶

The price for 'J' color with premium cut is high and 'D' color with ideal cut is low¶

The price for D color with 'IF' clarity is high and price for E color with 'ws1' is low, as we know the 'D' color and 'IF' clarity are best. So, together they are increasing the price.¶

The price for premium cut,'J' color and 'SI2' are high¶

The SI2 type clarity has more price and WS1 has low price¶

The price and carat have strong positive relationship¶

The features like x,y,z have strong positive relationship with carat and price¶

The carat,cut,color,clarity,x,y,z these features are more influencing price¶

In [78]:
df.columns
Out[78]:
Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')
In [79]:
fig = px.scatter(df, x='carat', y='price', animation_frame='cut',
                 range_x=[0, df['carat'].max()+1],
                 range_y=[0, df['price'].max()+1000],
                 labels={'carat': 'Carat', 'price': 'Price'},
                 title="Carat vs Price Relationship by Cut")

# Set layout properties for aesthetics 
fig.update_layout(
    showlegend=False,
    width=800,
    height=500,
    xaxis=dict(title=dict(text='Carat')),
    yaxis=dict(title=dict(text='Price'))
)

# Show the animated scatter plot 
fig.show()
In [80]:
data = df.groupby(['cut', 'color'])['price'].mean().reset_index()
In [81]:
import pandas as pd
import plotly.express as px

df = pd.read_csv('diamonds.csv')
In [82]:
grouped_df = df.groupby(['cut', 'color']).mean()['price'].reset_index()
C:\Users\alexa\AppData\Local\Temp\ipykernel_19288\1141413756.py:1: FutureWarning:

The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

In [83]:
fig = px.imshow(grouped_df.pivot(index='color', columns='cut', values='price'),
                labels=dict(x="Cut", y="Color", color="Price"),
                x=['Fair', 'Good', 'Ideal', 'Premium','Very Good'],
                y=['D','E','F','G','H','I','J'],
                title="Diamond Price Heatmap")

fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            buttons=[dict(label="Play",
                          method="animate",
                          args=[None,
                                {"frame": {"duration": 1000, "redraw": True},
                                 "fromcurrent": True,
                                 "transition": {"duration": 500}}]),
                    dict(label="Pause",
                         method="animate",
                         args=[[None],
                               {"frame": {"duration": 0, "redraw": False},
                                "mode":"immediate"}])]
        )
    ])

frames = []
for cut in grouped_df['cut'].unique():
    frame_data = grouped_df[grouped_df['cut'] == cut].pivot(index='color',
                                                            columns='cut',
                                                            values='price').values.tolist()
    frames.append(dict(data=[dict(z=frame_data,
                                  type='heatmap')],
                       name=cut))

fig.frames = frames

fig.show()

The End¶

In [ ]: